{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 2.9 \u5c06Unicode\u6587\u672c\u6807\u51c6\u5316\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### \u95ee\u9898\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u4f60\u6b63\u5728\u5904\u7406Unicode\u5b57\u7b26\u4e32\uff0c\u9700\u8981\u786e\u4fdd\u6240\u6709\u5b57\u7b26\u4e32\u5728\u5e95\u5c42\u6709\u76f8\u540c\u7684\u8868\u793a\u3002"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### \u89e3\u51b3\u65b9\u6848\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u5728Unicode\u4e2d\uff0c\u67d0\u4e9b\u5b57\u7b26\u80fd\u591f\u7528\u591a\u4e2a\u5408\u6cd5\u7684\u7f16\u7801\u8868\u793a\u3002\u4e3a\u4e86\u8bf4\u660e\uff0c\u8003\u8651\u4e0b\u9762\u7684\u8fd9\u4e2a\u4f8b\u5b50\uff1a"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "s1 = 'Spicy Jalape\\u00f1o'\ns2 = 'Spicy Jalapen\\u0303o'\ns1"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "s2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "s1 == s2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "len(s1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "len(s2)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u8fd9\u91cc\u7684\u6587\u672c\u201dSpicy Jalape\u00f1o\u201d\u4f7f\u7528\u4e86\u4e24\u79cd\u5f62\u5f0f\u6765\u8868\u793a\u3002\n\u7b2c\u4e00\u79cd\u4f7f\u7528\u6574\u4f53\u5b57\u7b26\u201d\u00f1\u201d(U+00F1)\uff0c\u7b2c\u4e8c\u79cd\u4f7f\u7528\u62c9\u4e01\u5b57\u6bcd\u201dn\u201d\u540e\u9762\u8ddf\u4e00\u4e2a\u201d~\u201d\u7684\u7ec4\u5408\u5b57\u7b26(U+0303)\u3002"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u5728\u9700\u8981\u6bd4\u8f83\u5b57\u7b26\u4e32\u7684\u7a0b\u5e8f\u4e2d\u4f7f\u7528\u5b57\u7b26\u7684\u591a\u79cd\u8868\u793a\u4f1a\u4ea7\u751f\u95ee\u9898\u3002\n\u4e3a\u4e86\u4fee\u6b63\u8fd9\u4e2a\u95ee\u9898\uff0c\u4f60\u53ef\u4ee5\u4f7f\u7528unicodedata\u6a21\u5757\u5148\u5c06\u6587\u672c\u6807\u51c6\u5316\uff1a"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import unicodedata\nt1 = unicodedata.normalize('NFC', s1)\nt2 = unicodedata.normalize('NFC', s2)\nt1 == t2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "print(ascii(t1))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "t3 = unicodedata.normalize('NFD', s1)\nt4 = unicodedata.normalize('NFD', s2)\nt3 == t4"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "print(ascii(t3))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "normalize() \u7b2c\u4e00\u4e2a\u53c2\u6570\u6307\u5b9a\u5b57\u7b26\u4e32\u6807\u51c6\u5316\u7684\u65b9\u5f0f\u3002\nNFC\u8868\u793a\u5b57\u7b26\u5e94\u8be5\u662f\u6574\u4f53\u7ec4\u6210(\u6bd4\u5982\u53ef\u80fd\u7684\u8bdd\u5c31\u4f7f\u7528\u5355\u4e00\u7f16\u7801)\uff0c\u800cNFD\u8868\u793a\u5b57\u7b26\u5e94\u8be5\u5206\u89e3\u4e3a\u591a\u4e2a\u7ec4\u5408\u5b57\u7b26\u8868\u793a\u3002"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Python\u540c\u6837\u652f\u6301\u6269\u5c55\u7684\u6807\u51c6\u5316\u5f62\u5f0fNFKC\u548cNFKD\uff0c\u5b83\u4eec\u5728\u5904\u7406\u67d0\u4e9b\u5b57\u7b26\u7684\u65f6\u5019\u589e\u52a0\u4e86\u989d\u5916\u7684\u517c\u5bb9\u7279\u6027\u3002\u6bd4\u5982\uff1a"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "s = '\\ufb01' # A single character\ns"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "unicodedata.normalize('NFD', s)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "unicodedata.normalize('NFKD', s)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "unicodedata.normalize('NFKC', s)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### \u8ba8\u8bba\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u6807\u51c6\u5316\u5bf9\u4e8e\u4efb\u4f55\u9700\u8981\u4ee5\u4e00\u81f4\u7684\u65b9\u5f0f\u5904\u7406Unicode\u6587\u672c\u7684\u7a0b\u5e8f\u90fd\u662f\u975e\u5e38\u91cd\u8981\u7684\u3002\n\u5f53\u5904\u7406\u6765\u81ea\u7528\u6237\u8f93\u5165\u7684\u5b57\u7b26\u4e32\u800c\u4f60\u5f88\u96be\u53bb\u63a7\u5236\u7f16\u7801\u7684\u65f6\u5019\u5c24\u5176\u5982\u6b64\u3002"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u5728\u6e05\u7406\u548c\u8fc7\u6ee4\u6587\u672c\u7684\u65f6\u5019\u5b57\u7b26\u7684\u6807\u51c6\u5316\u4e5f\u662f\u5f88\u91cd\u8981\u7684\u3002\n\u6bd4\u5982\uff0c\u5047\u8bbe\u4f60\u60f3\u6e05\u9664\u6389\u4e00\u4e9b\u6587\u672c\u4e0a\u9762\u7684\u53d8\u97f3\u7b26\u7684\u65f6\u5019(\u53ef\u80fd\u662f\u4e3a\u4e86\u641c\u7d22\u548c\u5339\u914d)\uff1a"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "t1 = unicodedata.normalize('NFD', s1)\n''.join(c for c in t1 if not unicodedata.combining(c))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u6700\u540e\u4e00\u4e2a\u4f8b\u5b50\u5c55\u793a\u4e86 unicodedata \u6a21\u5757\u7684\u53e6\u4e00\u4e2a\u91cd\u8981\u65b9\u9762\uff0c\u4e5f\u5c31\u662f\u6d4b\u8bd5\u5b57\u7b26\u7c7b\u7684\u5de5\u5177\u51fd\u6570\u3002\ncombining() \u51fd\u6570\u53ef\u4ee5\u6d4b\u8bd5\u4e00\u4e2a\u5b57\u7b26\u662f\u5426\u4e3a\u548c\u97f3\u5b57\u7b26\u3002\n\u5728\u8fd9\u4e2a\u6a21\u5757\u4e2d\u8fd8\u6709\u5176\u4ed6\u51fd\u6570\u7528\u4e8e\u67e5\u627e\u5b57\u7b26\u7c7b\u522b\uff0c\u6d4b\u8bd5\u662f\u5426\u4e3a\u6570\u5b57\u5b57\u7b26\u7b49\u7b49\u3002"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Unicode\u663e\u7136\u662f\u4e00\u4e2a\u5f88\u5927\u7684\u4e3b\u9898\u3002\u5982\u679c\u60f3\u66f4\u6df1\u5165\u7684\u4e86\u89e3\u5173\u4e8e\u6807\u51c6\u5316\u65b9\u9762\u7684\u4fe1\u606f\uff0c\n\u8bf7\u770b\u8003 Unicode\u5b98\u7f51\u4e2d\u5173\u4e8e\u8fd9\u90e8\u5206\u7684\u8bf4\u660e\nNed Batchelder\u5728 \u4ed6\u7684\u7f51\u7ad9\n\u4e0a\u5bf9Python\u7684Unicode\u5904\u7406\u95ee\u9898\u4e5f\u6709\u4e00\u4e2a\u5f88\u597d\u7684\u4ecb\u7ecd\u3002"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.1"
    },
    "toc": {
      "base_numbering": 1,
      "nav_menu": {},
      "number_sections": true,
      "sideBar": true,
      "skip_h1_title": true,
      "title_cell": "Table of Contents",
      "title_sidebar": "Contents",
      "toc_cell": false,
      "toc_position": {},
      "toc_section_display": true,
      "toc_window_display": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}